library(rmapshaper)
library(censusapi)
library(shapefiles)
library(base)
library(tidyverse)
library(tigris)
library(sf)
library(mapview)
library(raster)
library(rlang)
library(rgeos)
library(data.table)
library(measurements)
library(smoothr)
library(lwgeom)
library(units)
library(geosphere)
library(kableExtra)
library(leaflet)
library(htmltools)
library(plotly)

mapviewOptions(
  basemaps = "OpenStreetMap"
)
options(
  tigris_class = "sf",
  tigris_use_cache = TRUE
)

Sys.setenv(CENSUS_KEY="b88495f8315f45b2d100dee9ba8f4c489a7371c2")

projection <- "+proj=utm +zone=10 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=ft +no_defs"

`%notin%` <- Negate(`%in%`)

Introduction

Does the location of where one lives impact the ability to enjoy outdoor green space? Does the size of the yard at your house play a factor in whether you regularly visit parks? Are communities under-visiting parks that are physically closer to them? The goal of this analysis is to provide insight on park accessibility within communities in the City of San Jose. We will consider factors such as backyard size, park visit frequency, and distance from a park to determine a community’s park accessibility.

The first few sections describe our methodological approaches for preparing the factors for analysis, followed by sections detailing the comparisons between factors and the resulting conclusions.

Safegraph Data Collection Explanation

The following describes how the Safegraph data is processed:

Safegraph collects device visit information to “places of interest” (POI) like parks or grocery stores and includes some information about a device’s origin census block group. The visit counts in the Safegraph dataset fall into either one of these three cases:

Note: We obtained all of the data for the park visits portion of the analysis from Safegraph Patterns Data.

Methodology: Collecting San Jose Residential Yard Sizes

Our goal for this section is to obtain the average residential yard size for each census block group in San Jose. We start with parcel data from the Santa Clara County Assessor’s Office, and filter it down to only include residential zoned parcels. The filtering criteria is based on property use codes that are classified as residential. The following list details the land use codes we used for this analysis:

After filtering to parcels with these residential use codes, we spatially match the building footprint shape associated with that parcel. To get from parcel area to yard area, we subtract the building footprint shape out of the parcel shape.

#load("P:/SFBI/Data Library/OSM/osm_bldg.Rdata")
# 
#osm_bldg <-
#  osm_bldg %>% st_transform(projection)

sj_boundary <-
  places("CA", cb=FALSE) %>%
  filter(NAME == "San Jose") %>%
  st_transform(projection)
# 
sj_blockgroup <-
  readRDS("sj_blockgroups.rds") %>%
  st_transform(projection)

#mapview(sj_blockgroup)


#load("P:/SFBI/Data Library/California Blocks/ca_blocks_lite.Rdata")
#ca_blocks_lite <- ca_blocks_lite %>% st_transform(projection)
#sj_blocks <- ca_blocks_lite[sj_boundary,]
#save(sj_blocks, file = "sj_blocks.Rdata")
#load("sj_blocks.Rdata")

#mapview(sj_blocks[100,])

# sj_bldg <-
#   osm_bldg[sj_boundary,] %>% 
#   transmute(
#     index = row_number()
#   )

#save(sj_bldg, file = "sj_bldg.Rdata")
load("sj_bldg.Rdata")

# scc_parcels <- st_read("P:/SFBI/Data Library/Parcels/santa clara/scc.shp")
# 
# scc_parcels <- 
#   scc_parcels %>%
#   st_transform(projection)
# 
# This includes parcels parcels at the periphery of San Jose. This is important for the "block method" used to find street edges. otherwise this sf object isn't used.
# sj_parcels_plus <-
#   scc_parcels[sj_blocks,] %>%
#   dplyr::select(APN, USE_CODE, SITUS_CITY)
# # 
# sj_parcels <-
#   sj_parcels_plus %>%
#   filter(SITUS_CITY == "SAN JOSE")
# #
# save(sj_parcels_plus, sj_parcels, file = "sj_parcels.Rdata")

load("sj_parcels.Rdata")


# apn_bg_lookup <- data.frame(APN = NA, origin_census_block_group = NA)
# 
# for(i in 1:nrow(sj_blockgroup)){
#   if(i %% 10 == 0){print(i)}
#     temp <- sj_blockgroup[i,]
#     temp2 <- sj_parcels[temp,] %>%
#       dplyr::select(APN) %>%
#       st_set_geometry(NULL) %>%
#       mutate(
#         origin_census_block_group = temp$origin_census_block_group
#       )
#     apn_bg_lookup <-
#       apn_bg_lookup %>%
#       rbind(temp2)
# }
# 
# apn_bg_lookup <-
#   apn_bg_lookup %>%
#   na.omit()
# 
# saveRDS(apn_bg_lookup,"sj_apn_bg_lookup.rds")

apn_bg_lookup <- readRDS("sj_apn_bg_lookup.rds")



sj_parcels_residential <-
  sj_parcels %>%
  mutate(
    USE_CODE = as.numeric(USE_CODE)
  ) %>%
  filter(USE_CODE %in% c(1,2,3,4,6,7))


#mapview(head(sj_parcels_residential,100))

matched_bldg_footprint <-
  sj_bldg %>%
  st_centroid() %>%
  st_intersection(sj_parcels_residential) %>%
  st_set_geometry(NULL) %>%
  left_join(sj_bldg, by = "index") %>%
  st_as_sf()
# 
# 
residential_parcels <-
  sj_parcels_residential %>%
  mutate(
     parcel_area = st_area(.) %>% as.numeric()
  ) %>%
  as.data.frame() %>%
  st_as_sf() %>%
  rename(parcel_geometry = geometry)
# 
# 
residential_bldg <-
  residential_parcels %>%
  as.data.frame() %>%
  right_join(
    matched_bldg_footprint %>% dplyr::select(APN),
    by = "APN"
  ) %>%
  filter(!is.na(parcel_area)) %>%
  rename(building_geometry = geometry) %>%
  st_as_sf() %>%
  st_set_geometry("building_geometry")
# 
# 
residential_parcels <-
  residential_bldg %>%
  st_set_geometry("parcel_geometry")
# 
residential_parcels <- residential_parcels[!duplicated(residential_parcels), ]

##############################################################
# available_yard <-
#   st_difference(
#     residential_parcels[1:10000,],
#     st_union(
#       residential_parcels$building_geometry[1:10000,])
#  )
# 
# available_yard2 <-
#   st_difference(
#     residential_parcels[10001:20000,],
#     st_union(
#       residential_parcels$building_geometry[10001:20000,])
#  )
# 
# available_yard3 <-
#   st_difference(
#     residential_parcels[20001:30000,],
#     st_union(
#       residential_parcels$building_geometry[20001:30000,])
#  )
# #
# available_yard4 <-
#   st_difference(
#     residential_parcels[30001:40000,],
#     st_union(
#       residential_parcels$building_geometry[30001:40000,])
#  )
# 
# available_yard5 <-
#   st_difference(
#     residential_parcels[40001:50000,],
#     st_union(
#       residential_parcels$building_geometry[40001:50000,])
#  )
# 
# available_yard6 <-
#   st_difference(
#     residential_parcels[50001:60000,],
#     st_union(
#       residential_parcels$building_geometry[50001:60000,])
#  )
# 
# available_yard7 <-
#   st_difference(
#     residential_parcels[60001:70000,],
#     st_union(
#       residential_parcels$building_geometry[60001:70000,])
#  )
# 
# available_yard8 <-
#   st_difference(
#     residential_parcels[70001:80000,],
#     st_union(
#       residential_parcels$building_geometry[70001:80000,])
#  )
# 
# available_yard9 <-
#   st_difference(
#     residential_parcels[80001:90000,],
#     st_union(
#       residential_parcels$building_geometry[80001:90000,])
#  )
# 
# available_yard10 <-
#   st_difference(
#     residential_parcels[90001:100000,],
#     st_union(
#       residential_parcels$building_geometry[90001:100000,])
#  )
# 
# available_yard10 <-
#   st_difference(
#     residential_parcels[100001:nrow(residential_parcels),],
#     st_union(
#       residential_parcels$building_geometry[100001:nrow(residential_parcels),])
#  )


# all_available_yard <-
#   available_yard %>%
#   rbind(available_yard2) %>%
#   rbind(available_yard3) %>%
#   rbind(available_yard4) %>%
#   rbind(available_yard5) %>%
#   rbind(available_yard6) %>%
#   rbind(available_yard7) %>%
#   rbind(available_yard8) %>%
#   rbind(available_yard9) %>%
#   rbind(available_yard10)



#save(all_available_yard, file = "sj_all_yard_available.Rdata")

load("sj_all_yard_available.Rdata")


all_available_yard_other <-
  all_available_yard %>%
  filter(USE_CODE == 2 | USE_CODE == 3)

all_available_yard_apartment <-
  all_available_yard %>%
  filter(USE_CODE == 4) 
  
all_available_yard_other2 <-
  all_available_yard %>%
  filter(USE_CODE %in% c(6,7))

available_area_fun <- function(df, residential_parcels){
  df %>%
    as.data.frame() %>%
    rename(geometry = parcel_geometry) %>%
    left_join(residential_parcels %>% dplyr::select(APN), by = "APN") %>%
    st_as_sf() %>%
    st_set_geometry("geometry") %>%
    as_Spatial() %>%
    gBuffer(byid = T, width = 0) %>%
    sp::disaggregate() %>%
    st_as_sf() %>%
    rename(available_area_geometry = geometry) %>%
    st_set_geometry("available_area_geometry") %>%
    mutate(
      available_area = round(st_area(.) %>% as.numeric())
    ) %>%
    st_set_crs(projection)
}


# save(available_area, file = "sj_available_backyard_area.Rdata")

#!!!!This .Rdata file constians all of the yard areas for parcels zoned as single-family residential!!!!
load("sj_available_backyard_area.Rdata")
 

# PUC 2,3
available_area1_other <- available_area_fun(all_available_yard_other[1:nrow(all_available_yard_other),], residential_parcels)

available_area1_other <-
  available_area1_other %>%
  dplyr::select(APN, available_area, available_area_geometry) 


#PUC 6,7
available_area2_other <- available_area_fun(all_available_yard_other2, residential_parcels)

available_area2_other <-
  available_area2_other %>%
  dplyr::select(APN, available_area, available_area_geometry) 



available_area_apartment <-
  all_available_yard_apartment %>%
  mutate(
    bldg_area = st_area(building_geometry)
    ) %>%
  group_by(APN, parcel_area) %>%
  summarize(bldg_area_sum = sum(as.numeric(bldg_area))) %>%
  mutate(available_area = parcel_area - bldg_area_sum)

available_area_apartment <-
  available_area_apartment %>%
  rename(available_area_geometry = building_geometry) %>%
  dplyr::select(APN,available_area,available_area_geometry) %>%
  mutate(
    available_area = ifelse(available_area < 0,0,available_area)
  )

#combining all PUC dataframes
available_area_combo <-
  available_area %>%
  rbind(available_area1_other) %>%
  rbind(available_area2_other) %>%
  rbind(available_area_apartment)


nix_list <- c("69403028","41425018","67013004","67013003","67029024","74207011")


nix_df <-
  available_area %>%
  filter(APN %in% nix_list) %>%
  left_join(
    all_available_yard %>%
      st_set_geometry(NULL) %>%
      dplyr::select(APN, USE_CODE) %>%
      filter(APN %in% nix_list),
    by = "APN"
    )


available_area_bg <-
  available_area_combo %>%
  left_join(
    apn_bg_lookup %>%
      filter(APN %in% available_area$APN),
    by = "APN"
  ) %>%
  filter(APN %notin% nix_list)

#  
bg_area_avg <-
  available_area_bg %>%
  st_set_geometry(NULL) %>%
  group_by(origin_census_block_group) %>%
  summarise(yard_area_avg = sum(available_area))
# 
extra_bg <-
  sj_blockgroup %>%
  filter(origin_census_block_group %notin% bg_area_avg$origin_census_block_group) %>%
  st_set_geometry(NULL) %>%
  dplyr::select(origin_census_block_group) %>%
  mutate(yard_area_avg = 0)
# 
bg_area_avg_all <-
  bg_area_avg %>%
  rbind(extra_bg) %>%
  left_join(
    sj_blockgroup %>%
      dplyr::select(-DISTRICTS),
    by = "origin_census_block_group"
  ) %>%
  na.omit() %>%
  st_as_sf() %>%
  st_transform(4326)

  

#save(bg_area_avg_all, file = "sj_bg_avg_yard_area.Rdata")
load("sj_bg_avg_yard_area.Rdata")

sj_population <-
  getCensus(
    name = "acs/acs5",
    vintage = 2018,
    region = "block group:*",
    regionin = "state:06+county:085",
    vars = c("group(B01003)")
  ) %>%
  mutate(
    origin_census_block_group =
      paste0(state,county,tract,block_group)
  ) %>%
  rename(
    total_pop = "B01003_001E"
  ) %>%
  dplyr::select(total_pop, origin_census_block_group)

sj_yard_pop_weight <-
  bg_area_avg_all %>%
  left_join(
    sj_population %>%
      filter(origin_census_block_group %in% bg_area_avg_all$origin_census_block_group),
    by = "origin_census_block_group"
  ) %>%
  mutate(
    yard_per_capita = yard_area_avg / total_pop
  )

The following map shows the void building footprint within the parcel shape for one block group. The resulting shape represents the yard area.

example_bg_yard <-
  available_area_bg %>%
  filter(origin_census_block_group == "060855006002")

mapview(example_bg_yard)

In order to align our geographic granularity with the other factors of our analysis, we take all of the yard sizes within a block group and calculate the sum of all of the yards. This value is then divided by the block group population to obtain the yard area per capita. The following map shows the yard area per capita sizes for all of the block groups in San Jose. Notice that the largest values occur near the left border of the city, near the hills.

bg_area_avg_minus <-
  sj_yard_pop_weight 

blue_pal2 <- colorNumeric(
  palette = colorRamp(c("#FFFFFF", "#00288c"), interpolate="spline"),
  domain =
    bg_area_avg_minus %>%
    pull(yard_per_capita) %>%
    unique()
)

yard_bg_map_minus <-
  leaflet() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(
    data = sj_boundary %>% st_transform(4326),
    fill = F,
    color = "gray",
    weight = 1,
    label = ~NAME
  ) %>%
  addPolygons(
    data = bg_area_avg_minus,
    fillColor = ~blue_pal2(yard_per_capita),
    color = "black",
    weight = 0.5,
    opacity = 0.5,
    fillOpacity = 0.75,
    #group = "Average Daily Park Visits",
    label = ~paste0("Block Group Average Yard Size per Capita: ",round(yard_per_capita), " square feet"),
    highlightOptions =
      highlightOptions(
        weight = 2.25,
        opacity = 1
      )) %>%
  addLegend(
    position = "topright", 
    pal = blue_pal2, 
    values = bg_area_avg_minus$yard_per_capita,
    title = "Avg. Yard Area per Capita (sqft.)",
    opacity = 1
  )

yard_bg_map_minus

Note: The data from the Santa Clara County assessor’s office likely includes multiple zoning misclassifications. For example, the following parcel has a Property Use Code of 1 (single-family residential), but it is actually a water treatment plant. This “yard” (that has an area of 1.0067310^{6} square feet) was inflating the yard size for the block group in which it is located.

mapview(nix_df[7,])

I did a visual inspection of the single-family residential parcels in the block groups with large average yard sizes (over 30,000 sqft.), and removed the parcels that appeared to be misclassified. Keep in mind that there still may be other misclassified parcels that are still being incorporated into the block group average yard size.

Some overall San Jose residential yard size statistics:

Mean San Jose Yard Area per Capita: 386

Standard Deviation: 660

Minimum San Jose Yard Size per Capita: 0

Maximum San Jose Yard Size per Capita: 6300

For now, I will but a hold on this analysis and move to the average number of people who visit San Jose parks that reside in these block groups. At the end, we will come back to these yard areas to compare against other accessibility factors.

Methodology: Obtaining CBG Visits to San Jose Parks

When looking at park visitors who live in specific census block groups (CBGs), it is not sufficient to consider the CBG visit numbers at face value. The population of each block group plays an important factor in weighing a community’s visits to parks. For example, if a CBG has 3 residents, and they have an average of 3 daily park visits, then we can infer that this community has very active park visitors (even though the 3 park visits may seem low).

The following map displays all of the census block groups we considered in this analysis (see the Safegraph Data Collection Explanation section above for why some CBGs are missing). The “Park Visits” option illustrates the average number of visits that each CBG has had to a SJ park since March 2, 2020. The “Park Visits per Population” displays a ratio, which provides more insight into how actively a community visits parks (a higher ratio = more active, a lower ratio = less active).

####periodically update from sanjose_park_process file
park_origins <- readRDS("sj_park_origins.rds") %>%
  filter(origin_census_block_group %in% sj_blockgroup$origin_census_block_group)

park_geometry <-
  readRDS('park_output_geometry.rds') %>%
  st_as_sf() %>%
  st_transform(projection)
park_origin_mean <-
  park_origins %>%
  group_by(origin_census_block_group) %>%
  summarise(mean_visits = mean(mean_origin_visits))

visit_pop_combo <-
  sj_population %>%
  filter(origin_census_block_group %in% park_origin_mean$origin_census_block_group) %>%
  left_join(
    park_origin_mean,
    by = "origin_census_block_group"
  ) %>%
  na.omit() %>%
  mutate(
    visits_per_pop = round(mean_visits/total_pop,3)
  ) %>%
  left_join(
    sj_blockgroup %>%
      dplyr::select(-DISTRICTS),
    by = "origin_census_block_group"
  ) %>%
  st_as_sf() %>%
  st_transform(projection)

orange_pal <- colorNumeric(
  palette = colorRamp(c("#fcd786", "#a67100"), interpolate="spline"),
  domain =
    visit_pop_combo %>%
    pull(mean_visits) %>%
    unique()
)

teal_pal <- colorNumeric(
  palette = colorRamp(c("#d6fffe", "#015a68"), interpolate="spline"),
  domain =
    visit_pop_combo %>%
    pull(visits_per_pop) %>%
    unique()
)

all_visitors_map <-
  leaflet() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
 addPolygons(
    data = sj_boundary %>% st_transform(4326),
    fill = F,
    color = "gray",
    weight = 1,
    label = ~NAME
  ) %>%
  addPolygons(
    data = visit_pop_combo %>%
      st_transform(4326),
    fillColor = ~orange_pal(mean_visits),
    color = "black",
    weight = 0.5,
    opacity = 0.5,
    fillOpacity = 0.75,
    group = "Average Daily Park Visits",
    label = ~paste0(round(mean_visits)," average daily visits to SMC parks"),
    highlightOptions =
      highlightOptions(
        weight = 2.25,
        opacity = 1
      )) %>%
  addPolygons(
    data = visit_pop_combo %>%
      st_transform(4326),
    fillColor = ~teal_pal(visits_per_pop),
    color = "black",
    weight = 0.5,
    opacity = 0.5,
    fillOpacity = 0.75,
    group = "Average Park Visits per Population",
    label = ~paste0(visits_per_pop," average daily park visits per population"),
    highlightOptions =
      highlightOptions(
        weight = 2.25,
        opacity = 1
      )
  ) %>%
  addLayersControl(
    baseGroups = c("Average Daily Park Visits", "Average Park Visits per Population"),
    options = layersControlOptions(collapsed = FALSE)
  )

all_visitors_map

Some interesting results I would like to highlight:

The block group that has both the highest average daily park visits and highest visit/population ratio is shown in the following view. Interestingly, this CBG contains the Santa Clara County jail. Approximately 85.8% of the residents of this block group visit a San Jose park everyday. Perhaps the inmates perform community service frequently at San Jose parks.

all_visitors_map %>%
  setView(-121.906408, 37.350420, zoom = 15)

An additional block group that stands out is one located near Emma Prusch Farm Park. This CBG contains mainly an on-ramp, a portion of Bayshore Freeway, and a storage center, but no residential area. In the “Average Daily Park Visits” view, this CBG does not stand out. But when you switch to the “Average Park Visits per Population” view, the CBG shows to have the second highest visits/population ratio in all of San Jose (although the daily park visit average is only 8, 53% of those people visit parks everyday). This may be an instance of a homeless encampment underneath or near the freeway on-ramp, whose residents visit the nearby Emma Prusch Farm Park quite frequently.

homeless <-
  all_visitors_map %>%
  showGroup("Average Park Visits per Population") %>%
  setView(-121.847157, 37.335857, zoom = 15)

homeless

Initial Findings: Communities with the Lowest and Highest Park Visits

What communities visit parks more or less frequently than others? We will now directly compare the CBGs with the lowest and highest park visits per population ratios since March 2, 2020.

We considered a low visits per population ratio to be less than 0.02, and a high ratio to be more than 0.127 (one SD less/more than the mean, respectively).

lowest_visitors <-
  visit_pop_combo %>%
  filter(visits_per_pop < mean(visit_pop_combo$visits_per_pop)-sd(visit_pop_combo$visits_per_pop)) %>%
  st_as_sf() %>%
  st_transform(projection)

purple_pal <- colorNumeric(
  palette = colorRamp(c("#b27ef2", "#4900a3"), interpolate="spline"),
  domain =
    lowest_visitors %>%
    pull(visits_per_pop) %>%
    unique()
)

highest_visitors <-
  visit_pop_combo %>%
  filter(visits_per_pop > mean(visit_pop_combo$visits_per_pop)+sd(visit_pop_combo$visits_per_pop)) %>%
  st_as_sf() %>%
  st_transform(projection)

red_pal <- colorNumeric(
  palette = colorRamp(c("#ff8585", "#9c0000"), interpolate="spline"),
  domain =
    highest_visitors %>%
    pull(visits_per_pop) %>%
    unique()
)

extremes_visitors_map <- 
  leaflet() %>% 
  addProviderTiles(providers$CartoDB.Positron) %>% 
 addPolygons(
    data = sj_boundary %>%
      st_transform(4326),
    fill = F,
    color = "gray",
    weight = 1,
    label = ~NAME
  ) %>%
  addPolygons(
    data = lowest_visitors %>%
      st_transform(4326),
    fillColor = ~purple_pal(visits_per_pop),
    color = "black",
    weight = 0.5,
    opacity = 0.5,
    fillOpacity = 0.75,
    group = "CBGs with Lowest Park Visit Ratio",
    label = ~paste0(visits_per_pop," park visits per population"),
    highlightOptions = 
      highlightOptions(
        weight = 2.25,
        opacity = 1
      )
  ) %>%
  addPolygons(
    data = highest_visitors %>%
      st_transform(4326),
    fillColor = ~red_pal(visits_per_pop),
    color = "black",
    weight = 0.5,
    opacity = 0.5,
    fillOpacity = 0.75,
    group = "CBGs with Highest Park Visit Ratio",
    label = ~paste0(visits_per_pop," park visits per population"),
    highlightOptions = 
      highlightOptions(
        weight = 2.25,
        opacity = 1
      )
  ) %>%
  addLayersControl(
    overlayGroups = c("CBGs with Lowest Park Visit Ratio", "CBGs with Highest Park Visit Ratio"),
    options = layersControlOptions(collapsed = FALSE)
  )

extremes_visitors_map
#mapview(lowest_visitors, zcol = "visits_per_pop")

In the geographic sense, there appears to be no specific CBG pattern that would allow for predictive classification of a low or high visit/population ratio.

Methodology: Distance from San Jose CBGs to a Park

Analysis still yet to come

Comparison: Residential Yard Area vs. Park Visit per Population Ratio

The following scatter plot represents a linear regression between the average residential yard size and average park visits per population for each block group in San Jose. There is a negative correlation between these two factors, representing that in general, people who have larger yards tend to visit parks less. See the linear model summary after the plot for the statistical results.

yard_visits_tied <-
  visit_pop_combo %>%
  left_join(
    sj_yard_pop_weight %>%
      st_set_geometry(NULL) %>%
      dplyr::select(yard_per_capita, origin_census_block_group),
    by = "origin_census_block_group"
  ) %>%
  filter(yard_per_capita != 0 & yard_per_capita < 2600)

lim_mod <- lm(yard_visits_tied$visits_per_pop ~ yard_visits_tied$yard_per_capita)

lim_scatter <-
  yard_visits_tied %>% 
  ggplot(
    aes(
      x = yard_per_capita,
      y = visits_per_pop
    )
  ) +
  geom_point() +
  geom_smooth(method=lm) +
  labs(
    x = "CBG Yard Area per Capita (sqft.)",
    y = "Visits per CBG Population",
    title = "SMC Park Accessibility: Parks Visits per Population vs. Residential Yard Area per Population"
  )

lim_scatter

summary(lim_mod)
## 
## Call:
## lm(formula = yard_visits_tied$visits_per_pop ~ yard_visits_tied$yard_per_capita)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.059341 -0.024112 -0.009519  0.018016  0.196233 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      6.442e-02  2.701e-03  23.851  < 2e-16 ***
## yard_visits_tied$yard_per_capita 9.804e-06  3.485e-06   2.814  0.00518 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03585 on 348 degrees of freedom
## Multiple R-squared:  0.02224,    Adjusted R-squared:  0.01943 
## F-statistic: 7.917 on 1 and 348 DF,  p-value: 0.005176

Note: In the above scatter plot, a 2,600 sqft yard area per capita threshold is implemented in order to exclude outliers. The following histogram depicts all yard area per capita values. The main curve ends at 2,300 sqft, and the outlier clusters begin around 2,900 sqft. As mentioned earlier, these large values may be attributed to zoning misclassifications, so we do not include them in the linear model.

yard_hist <- plot_ly(
  x = sj_yard_pop_weight$yard_per_capita, 
  type = "histogram") %>%
  layout(
    title = "San Jose CBG Yard Area per Capita Distribution",
    shapes=list(type='line', x0= 2600, x1= 2600, y0=0, y1=316, line=list(dash='dot', width=1)),
         xaxis = list(
           title = "Yard Area per Capita (sqft.)"
         ),
         yaxis = list(
           title = "Number of CBG Occurances"
         ),
         annotations = list(
           list(
             xref = "x",
             yref = "y",
             x = 2600,
             y = 316,
             text = "Threshold",
             xanchor = "center",
             yanchor = "bottom",
             showarrow = F
           )
         )
  )

yard_hist